<!DOCTYPE html>
<html lang="en" xml:lang="en">
  <head>
    <title>Text analysis</title>
    <meta charset="utf-8" />
    <meta name="author" content="datasciencebox.org" />
    <script src="libs/header-attrs/header-attrs.js"></script>
    <link href="libs/font-awesome/css/all.css" rel="stylesheet" />
    <link href="libs/font-awesome/css/v4-shims.css" rel="stylesheet" />
    <link href="libs/panelset/panelset.css" rel="stylesheet" />
    <script src="libs/panelset/panelset.js"></script>
    <link rel="stylesheet" href="../xaringan-themer.css" type="text/css" />
    <link rel="stylesheet" href="../slides.css" type="text/css" />
  </head>
  <body>
    <textarea id="source">
class: center, middle, inverse, title-slide

.title[
# Text analysis
]
.subtitle[
## <br><br> Data Science in a Box
]
.author[
### <a href="https://datasciencebox.org/">datasciencebox.org</a>
]

---





layout: true
  
&lt;div class="my-footer"&gt;
&lt;span&gt;
&lt;a href="https://datasciencebox.org" target="_blank"&gt;datasciencebox.org&lt;/a&gt;
&lt;/span&gt;
&lt;/div&gt; 

---



class: middle

# Tidytext analysis

---

## Tidytext

.pull-left[
- Using tidy data principles can make many text mining tasks easier, more effective, and consistent with tools already in wide use
- Learn more at [tidytextmining.com](https://www.tidytextmining.com/)
]
.pull-right[
&lt;img src="img/tidytext.png" width="60%" style="display: block; margin: auto auto auto 0;" /&gt;
]

---

## What is tidy text?

.pull-left-wide[
.small[

```r
text &lt;- c("Oh! Get me away from here, I'm dying",
          "Play me a song to set me free",
          "Nobody writes them like they used to",
          "So it may as well be me",
          "Here on my own now after hours",
          "Here on my own now on a bus",
          "Think of it this way",
          "You could either be successful or be us",
          "With our winning smiles, and us",
          "With our catchy tunes or worse",
          "Now we're photogenic",
          "You know, we don't stand a chance")

text
```

```
##  [1] "Oh! Get me away from here, I'm dying"   
##  [2] "Play me a song to set me free"          
##  [3] "Nobody writes them like they used to"   
##  [4] "So it may as well be me"                
##  [5] "Here on my own now after hours"         
##  [6] "Here on my own now on a bus"            
##  [7] "Think of it this way"                   
##  [8] "You could either be successful or be us"
##  [9] "With our winning smiles, and us"        
## [10] "With our catchy tunes or worse"         
## [11] "Now we're photogenic"                   
## [12] "You know, we don't stand a chance"
```
]
]

---

## What is tidy text?


```r
text_df &lt;- tibble(line = 1:12, text = text)

text_df %&gt;% print(n = 12)
```

```
## # A tibble: 12 × 2
##     line text                                   
##    &lt;int&gt; &lt;chr&gt;                                  
##  1     1 Oh! Get me away from here, I'm dying   
##  2     2 Play me a song to set me free          
##  3     3 Nobody writes them like they used to   
##  4     4 So it may as well be me                
##  5     5 Here on my own now after hours         
##  6     6 Here on my own now on a bus            
##  7     7 Think of it this way                   
##  8     8 You could either be successful or be us
##  9     9 With our winning smiles, and us        
## 10    10 With our catchy tunes or worse         
## 11    11 Now we're photogenic                   
## 12    12 You know, we don't stand a chance
```

---

## What is tidy text?


```r
text_df %&gt;%
  unnest_tokens(word, text) %&gt;%
  print(n = 10)
```

```
## # A tibble: 80 × 2
##     line word 
##    &lt;int&gt; &lt;chr&gt;
##  1     1 oh   
##  2     1 get  
##  3     1 me   
##  4     1 away 
##  5     1 from 
##  6     1 here 
##  7     1 i'm  
##  8     1 dying
##  9     2 play 
## 10     2 me   
## # … with 70 more rows
```

---

## Case study: FM's COVID-19 speeches

&lt;br&gt;

.title-slide[
### [github.com/mine-cetinkaya-rundel/fm-speeches-covid19](https://github.com/mine-cetinkaya-rundel/fm-speeches-covid19)
]
    </textarea>
<style data-target="print-only">@media screen {.remark-slide-container{display:block;}.remark-slide-scaler{box-shadow:none;}}</style>
<script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
<script>var slideshow = remark.create({
"ratio": "16:9",
"highlightLines": true,
"highlightStyle": "solarized-light",
"countIncrementalSlides": false
});
// Only when window.HTMLWidgets is defined: dispatch a window-level 'resize'
// event every time remark reports that a slide has been shown.
if (window.HTMLWidgets) {
  slideshow.on('afterShowSlide', function () {
    window.dispatchEvent(new Event('resize'));
  });
}
// Append an @page rule whose size is copied from the inline width/height of
// the first .remark-slide-scaler element; does nothing if none is found.
(function (doc) {
  var scaler = doc.querySelector(".remark-slide-scaler");
  if (!scaler) return;

  var pageSize = scaler.style.width + " " + scaler.style.height;
  var styleEl = doc.createElement("style");
  styleEl.type = "text/css";
  styleEl.innerHTML = "@page {size: " + pageSize + "; }";
  doc.head.appendChild(styleEl);
})(document);

// For every slide (from the second onwards) whose properties mark it as
// `continued` or as not counted, add the `has-continuation` class to the
// container just before it, then install a print-only rule hiding those
// containers.
(function(d) {
  var areas = d.getElementsByClassName("remark-slides-area");
  // getElementsByClassName always returns a (possibly empty) collection, so a
  // plain truthiness test never fires; check the length instead to avoid a
  // TypeError on areas[0] when the slides area is missing.
  if (!areas.length) return;
  var slides = slideshow.getSlides(), containers = areas[0].children;
  for (var i = 1; i < slides.length; i++) {
    var props = slides[i].properties;
    if (props.continued === "true" || props.count === "false") {
      // The flag goes on the preceding container, not on slide i itself.
      var previous = containers[i - 1];
      if (previous) previous.classList.add('has-continuation');
    }
  }
  var s = d.createElement("style");
  s.type = "text/css";
  s.innerHTML = "@media print { .has-continuation { display: none; } }";
  d.head.appendChild(s);
})(document);
// Remove the temporary "print-only" stylesheet (used to display all slides
// initially) the first time remark is about to show a slide.
(function () {
  var alreadyRemoved = false;

  // Detach every stylesheet owner node tagged data-target="print-only".
  function removePrintOnlySheets() {
    var sheetList = document.styleSheets;
    for (var idx = 0; idx < sheetList.length; idx++) {
      var owner = sheetList[idx].ownerNode;
      if (owner.dataset.target === "print-only") {
        owner.parentNode.removeChild(owner);
      }
    }
  }

  slideshow.on('beforeShowSlide', function () {
    if (alreadyRemoved) return;
    removePrintOnlySheets();
    alreadyRemoved = true;
  });
})();
// Collect the single-letter shortcut keys listed in remark's help table and
// store them as JSON in a `data-at-shortcutkeys` attribute on <body>, to
// resolve conflicts with the JAWS screen reader (see PR #262).
(function (doc) {
  const shortcuts = {};
  const rows = doc.querySelectorAll('.remark-help-content table tr');

  for (const row of rows) {
    // The text of the row's second cell becomes the value for each key found.
    const description = row.querySelector('td:nth-child(2)').innerText;
    for (const keyEl of row.querySelectorAll('td:first-child .key')) {
      const label = keyEl.innerText;
      // Only a single lowercase letter counts as a shortcut key.
      if (/^[a-z]$/.test(label)) shortcuts[label] = description;
    }
  }

  doc.body.setAttribute('data-at-shortcutkeys', JSON.stringify(shortcuts));
})(document);
(function () {
  "use strict";
  // Swap every <script> found inside a slide container for a newly created
  // <script> with the same text and attributes, to make it executable.
  var slideScripts = document.querySelectorAll(
    '.remark-slides-area .remark-slide-container script'
  );

  Array.prototype.forEach.call(slideScripts, function (stale) {
    var fresh = document.createElement('script');
    fresh.appendChild(document.createTextNode(stale.textContent));
    // Copy every attribute (src, type, ...) onto the replacement.
    Array.prototype.forEach.call(stale.attributes, function (attr) {
      fresh.setAttribute(attr.name, attr.value);
    });
    stale.parentElement.replaceChild(fresh, stale);
  });
})();
// Open links whose href starts with http://, https:// or // in a new tab.
(function() {
  var absoluteUrl = /^(https?:)?\/\//;
  var links = document.getElementsByTagName('a');
  for (var i = 0; i < links.length; i++) {
    var href = links[i].getAttribute('href');
    // Anchors without an href attribute are left untouched.
    if (href === null || !absoluteUrl.test(href)) continue;
    links[i].target = '_blank';
    // Keep the newly opened page from reaching this one via window.opener.
    // relList is feature-tested because not every <a>-like element exposes it.
    if (links[i].relList) links[i].relList.add('noopener');
  }
})();
// adds .remark-code-has-line-highlighted class to <pre> parent elements
// of code chunks containing highlighted lines with class .remark-code-line-highlighted
(function(d) {
  const hlines = d.querySelectorAll('.remark-code-line-highlighted');
  const preParents = [];
  const findPreParent = function(line, p = 0) {
    if (p > 1) return null; // traverse up no further than grandparent
    const el = line.parentElement;
    return el.tagName === "PRE" ? el : findPreParent(el, ++p);
  };

  for (let line of hlines) {
    let pre = findPreParent(line);
    if (pre && !preParents.includes(pre)) preParents.push(pre);
  }
  preParents.forEach(p => p.classList.add("remark-code-has-line-highlighted"));
})(document);</script>

<script>
// Unwrap inline <code> elements whose whole text is a math expression, so the
// expression sits directly in the page instead of inside <code></code>.
// `el` is the root to search: any object with getElementsByTagName (the
// document is passed below).
slideshow._releaseMath = function(el) {
  var i, text, code, codes = el.getElementsByTagName('code');
  // No increment in the loop header: `i` only advances when the current
  // element is kept (see the `continue` below).
  for (i = 0; i < codes.length;) {
    code = codes[i];
    // Skip code blocks (<pre><code>) and <code> elements with child elements.
    if (code.parentNode.tagName !== 'PRE' && code.childElementCount === 0) {
      text = code.textContent;
      // Math delimiters recognised: \( ... \), \[ ... \], $$ ... $$ and
      // \begin{env} ... \end{env}; the text must start and end with them.
      if (/^\\\((.|\s)+\\\)$/.test(text) || /^\\\[(.|\s)+\\\]$/.test(text) ||
          /^\$\$(.|\s)+\$\$$/.test(text) ||
          /^\\begin\{([^}]+)\}(.|\s)+\\end\{[^}]+\}$/.test(text)) {
        code.outerHTML = code.innerHTML;  // remove <code></code>
        // getElementsByTagName returns a live collection: the element just
        // replaced has dropped out of `codes`, so index i already refers to
        // the next candidate and must not be incremented.
        continue;
      }
    }
    i++;
  }
};
// Run once over the whole document.
slideshow._releaseMath(document);
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
// Inject the MathJax <script> into <head> at runtime.
(function () {
  var script = document.createElement('script');
  script.type = 'text/javascript';
  // Always load MathJax over HTTPS. The earlier version stripped the scheme
  // to make the URL protocol-relative, which downgraded the request to plain
  // HTTP whenever the slides themselves were served over http.
  script.src = 'https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML';
  document.head.appendChild(script);
})();
</script>
  </body>
</html>
